"""
@ Title: Assignment 4
@ Author: 商研1 鄭子萱 r07741023
@ Date: 2019/4/27
"""
#-*- coding:utf-8 -*-
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus']=False # 正常顯示正負號
import warnings
warnings.filterwarnings("ignore")
stu_adm = pd.read_csv('ds/student_admission106.csv', encoding="utf-8", dtype=str)
uname = pd.read_csv('ds/univ_name106short1.csv', encoding="utf-8", dtype=str)
Report the number of academic departments and student applicants in your dataset.
# Build a department × student incidence matrix: cell = 1 iff the student applied
# to the department, 0 otherwise.
stu_adm['apply'] = 1
df = stu_adm.pivot(index='department_id', columns='student_id', values='apply').fillna(0)
# Iteratively filter until stable: keep departments with at least 10 applicants and
# students with at least 2 applications.  Each pass can break the other condition,
# so we loop until the shape stops changing.
r = -1
c = -1
while True:
    df = df[df.sum(axis=1) >= 10]        # rows: departments with >= 10 applications
    df = df[df.columns[df.sum() >= 2]]   # columns: students with >= 2 applications
    if (df.shape[0] == r) & (df.shape[1] == c):
        break
    r = df.shape[0]
    c = df.shape[1]
# BUG FIX: first label was missing its colon ("number of departments %s").
print("number of departments: %s\nnumber of students: %s" % (r, c))
Report the top ten departments that received the most applications and the number of applications they received. Identify the departments by their department_id and names.
# Top ten departments by total number of applications received.
top = df.sum(axis=1).sort_values(ascending=False)
top = pd.DataFrame(top.head(n=10), columns=['number of applications'])
# BUG FIX: 'department_id' is the pivot-table *index*, not a column, so
# pd.merge(top, uname, on='department_id') raised KeyError.  Reset the index
# first; how='left' preserves the descending-count order of the ten rows.
top = pd.merge(top.reset_index(), uname[['department_id', 'department_name']],
               on='department_id', how='left')
top
# Turn the filtered matrix into arrays for sklearn and attach category labels.
# BUG FIX: df's 'department_id' lives in the index, so pd.merge(df, uname, on=...)
# raised KeyError — reset the index first.  how='left' keeps df's row order, so
# y stays aligned row-for-row with X = df.values.
# NOTE(review): assumes every department_id appears in uname; unmatched rows would
# get NaN category_name (factorize code 0) — TODO confirm against the lookup file.
df_copy = pd.merge(df.reset_index(), uname, on='department_id', how='left')
# Encode each category_name as a 1-based integer code.
df_copy['category_code'] = pd.factorize(df_copy['category_name'])[0] + 1
X = df.values                                   # department × student 0/1 matrix
y = df_copy['category_code'].values             # integer category label per department
target_names = df_copy['category_name'].unique()
target_index = df_copy['category_code'].unique()
Visualize academic departments using the first 8 principal components. Use your judgement to select multiple pairs of principal components to visualize. Discuss the visual patterns with respect to department categories.
# Project the department incidence vectors onto the first 8 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=8) # keep 8 components
x_r = pca.fit_transform(X)
# PCA scatter of the first two components, coloured by department category.
# BUG FIX: the original sliced an undefined name (`color` was only built by the
# commented-out block that preceded this), raising NameError — use an explicit
# 10-colour list instead.  `colors` is reused by every plot below.
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
          'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
plt.figure(figsize=(10, 5))
lw = 0.5
# zip() stops at the shortest sequence: only the first 10 categories are plotted
# if there are more than 10.
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(x_r[y == i, 0], x_r[y == i, 1], lw=lw, c=c,
                label=target_name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('PCA')
plt.xlabel('pc1')
plt.ylabel('pc2')
#plt.tight_layout()
# Per-component explained variance of the fitted PCA, plus a cumulative step curve.
explained_variance_ratio = pca.explained_variance_ratio_
component_ids = range(len(explained_variance_ratio))
plt.figure(figsize=(10, 5))
plt.bar(component_ids, explained_variance_ratio,
        alpha=0.5, align='center', label='Explained variance ratio')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
# Legend drawn before the step curve, so the cumulative label is not listed
# (matches the original cell's behaviour).
plt.legend(loc='best')
plt.title("Explained Variance Ratio ", fontsize=20)
plt.step(component_ids, np.cumsum(explained_variance_ratio),
         where='mid', label='cumulative explained variance')
Visualize academic departments using multidimensional scaling. Consider both the metric and non-metric settings. Discuss the result.
# Metric multidimensional scaling (MDS) with sklearn defaults (2 components).
from sklearn import manifold
from sklearn.metrics import euclidean_distances  # NOTE(review): imported but unused in this cell
from sklearn.decomposition import PCA  # NOTE(review): re-import; PCA is already imported above
mds = manifold.MDS()
pos = mds.fit_transform(X)
# Scatter the metric-MDS embedding, coloured by department category.
plt.figure(figsize=(10, 5))
lw = 0.5
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(pos[y == i, 0], pos[y == i, 1], lw=lw, label=target_name, c=c)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('metric MDS')  # BUG FIX: displayed title typo 'MSD' -> 'MDS'
# Non-metric MDS embedding and scatter plot.
# BUG FIX: the original used manifold.MDS(n_components=8), which is still *metric*
# MDS — non-metric scaling requires metric=False.
nmds = manifold.MDS(n_components=8, metric=False)
npos = nmds.fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(npos[y == i, 0], npos[y == i, 1], lw=lw, c=c,
                label=target_name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('non-metric MDS')  # BUG FIX: displayed title typo 'MSD' -> 'MDS'
Visualize academic departments using Locally Linear Embedding. Consider three variations: (1) Use 20 neighbors to construct the weight matrix; (2) Use 40 neighbors to construct the weight matrix; (3) Perform PCA transformation first, and use the first 100 principal components as the input to LLE (with 20 neighbors). Discuss the result.
(1) Use 20 neighbors to construct the weight matrix
from sklearn.manifold import LocallyLinearEmbedding
# Locally Linear Embedding, reconstructing each department from its 20 nearest
# neighbours (2-D output by default), then a category-coloured scatter plot.
embedding = LocallyLinearEmbedding(n_neighbors=20)
x_r = embedding.fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for code, colour, name in zip(target_index, colors, target_names):
    members = (y == code)
    plt.scatter(x_r[members, 0], x_r[members, 1], lw=lw, c=colour, label=name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('LLE using 20 neighbors')
(2) Use 40 neighbors to construct the weight matrix
# Same LLE embedding but with a wider neighbourhood of 40 points.
embedding = LocallyLinearEmbedding(n_neighbors=40)
x_r_2 = embedding.fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for code, colour, name in zip(target_index, colors, target_names):
    members = (y == code)
    plt.scatter(x_r_2[members, 0], x_r_2[members, 1], lw=lw, c=colour, label=name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('LLE using 40 neighbors')
(3) Perform PCA transformation first, and use the first 100 principal components as the input to LLE (with 20 neighbors)
# pca first: reduce to the first 100 principal components, then run LLE
# (20 neighbours) on the reduced data.
# NOTE(review): this rebinds `pca`, clobbering the 8-component model fitted earlier.
pca = PCA(n_components = 100)
x_r_pca = pca.fit_transform(X)
embedding = LocallyLinearEmbedding(n_neighbors = 20)
x_r_3 = embedding.fit_transform(x_r_pca)
plt.figure(figsize=(10,5))
lw = 0.5
# One scatter call per category so each gets its own colour and legend entry.
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(x_r_3[y == i, 0], x_r_3[y == i, 1], lw=lw, c=c,
                label=target_name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('LLE using 20 neighbors with 100 pca')
Visualize academic department using Kernel PCA. You should at least consider the RBF and Cosine kernel. It is your responsibility to select reasonably good kernel parameters. Discuss the result.
(1) using RBF
# Kernel PCA with an RBF kernel; kernel parameters left at sklearn defaults.
from sklearn.decomposition import KernelPCA
transformer = KernelPCA(kernel='rbf')
x_r_4 = transformer.fit_transform(X)
plt.figure(figsize=(10,5))
lw = 0.5
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(x_r_4[y == i, 0], x_r_4[y == i, 1], lw=lw, c=c,
                label=target_name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('KernalPCA using RBF')  # NOTE(review): 'Kernal' typo in displayed title
(2) using cosine
# Kernel PCA again, this time with the cosine kernel.
transformer = KernelPCA(kernel='cosine')
x_r_5 = transformer.fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for code, colour, name in zip(target_index, colors, target_names):
    members = (y == code)
    plt.scatter(x_r_5[members, 0], x_r_5[members, 1], lw=lw, c=colour, label=name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('KernalPCA using cosine')
Visualize academic departments using t-SNE. You should consider at least the Euclidean, Cosine, and Jaccard metrics. Set the numpy random seed so that your results can be repeated. Discuss the result.
# Fixed-seed RandomState shared by the t-SNE runs below, so results are repeatable.
seed = np.random.RandomState(seed=3)
(1) using Euclidean
from sklearn.manifold import TSNE
# t-SNE with the Euclidean distance metric, seeded for repeatability.
x_r_6 = TSNE(metric="euclidean", random_state=seed).fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for code, colour, name in zip(target_index, colors, target_names):
    pts = x_r_6[y == code]
    plt.scatter(pts[:, 0], pts[:, 1], lw=lw, c=colour, label=name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('t-SNE using euclidean')
(2) using cosine
# t-SNE with the cosine distance metric.
x_r_7 = TSNE(metric="cosine", random_state=seed).fit_transform(X)
plt.figure(figsize=(10, 5))
lw = 0.5
for code, colour, name in zip(target_index, colors, target_names):
    pts = x_r_7[y == code]
    plt.scatter(pts[:, 0], pts[:, 1], lw=lw, c=colour, label=name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('t-SNE using cosine')
(3) using Jaccard
# t-SNE with the Jaccard distance metric.
# NOTE(review): X holds 0/1 values stored as floats; presumably nonzero entries are
# treated as set membership for Jaccard — confirm no dtype warnings/mismatch.
x_r_8 = TSNE(metric="jaccard", random_state=seed).fit_transform(X)
plt.figure(figsize=(10,5))
lw = 0.5
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(x_r_8[y == i, 0], x_r_8[y == i, 1], lw=lw, c=c,
                label=target_name)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('t-SNE using Jaccard')
# Enlarged version of the cosine t-SNE plot with every department name annotated,
# saved to disk as a high-resolution PNG.
fig, ax = plt.subplots(figsize=(60,30))
lw = 0.5
for i, c, target_name in zip(target_index, colors, target_names):
    plt.scatter(x_r_7[y == i, 0], x_r_7[y == i, 1], lw=lw, c=c,
                label=target_name, s=30)
# Label each point with its department name, offset slightly from the marker.
# NOTE(review): assumes df_copy rows align 1:1 with the rows of x_r_7 — TODO confirm.
for i, txt in enumerate(df_copy['department_name']):
    ax.annotate(txt, (x_r_7[i,0]+0.1, x_r_7[i,1]+0.1) )
plt.legend()
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('t-SNE using cosine')
plt.tight_layout()
plt.savefig('t-sne.png', dpi=300)